{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "fourth-madonna",
   "metadata": {
    "papermill": {
     "duration": 0.017282,
     "end_time": "2021-04-06T10:20:46.672670",
     "exception": false,
     "start_time": "2021-04-06T10:20:46.655388",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Samples from a parquet source using Apache Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "expressed-invalid",
   "metadata": {
    "papermill": {
     "duration": 2.563051,
     "end_time": "2021-04-06T10:20:49.245364",
     "exception": false,
     "start_time": "2021-04-06T10:20:46.682313",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%bash\n",
    "# Install the pinned PySpark stack that matches the interpreter's Python\n",
    "# version, writing pip's output to install.log.\n",
    "\n",
    "# 'Python 3.8.6' -> '38' (major and minor digits concatenated)\n",
    "export version=$(python --version | awk '{print $2}' | awk -F\".\" '{print $1$2}')\n",
    "\n",
    "echo \"$version\"\n",
    "\n",
    "# Choose the PySpark release for this Python version; anything else is rejected.\n",
    "case \"$version\" in\n",
    "    36|37) pyspark_version='2.4.8' ;;\n",
    "    38|39) pyspark_version='3.1.2' ;;\n",
    "    *)\n",
    "        echo 'Currently only python 3.6, 3.7 , 3.8 and 3.9 are supported, in case you need a different version please open an issue at https://github.com/IBM/claimed/issues'\n",
    "        exit -1\n",
    "        ;;\n",
    "esac\n",
    "\n",
    "echo 'Starting installation...'\n",
    "# '2>&1' makes stderr share stdout's handle on install.log. Redirecting both\n",
    "# streams to the file separately opens it twice, so they overwrite each other.\n",
    "if pip3 install \"pyspark==$pyspark_version\" wget==3.2 pyspark2pmml==0.5.1 > install.log 2>&1; then\n",
    "    echo 'Please <<RESTART YOUR KERNEL>> (Kernel->Restart Kernel and Clear All Outputs)'\n",
    "else\n",
    "    # Best effort: report the failure and show the log, but do not abort the cell.\n",
    "    echo 'Installation failed, please check log:'\n",
    "    cat install.log\n",
    "fi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "superior-advantage",
   "metadata": {
    "papermill": {
     "duration": 0.016731,
     "end_time": "2021-04-06T10:20:49.279025",
     "exception": false,
     "start_time": "2021-04-06T10:20:49.262294",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# @param data_dir temporary data storage for local execution\n",
    "# @param data_parquet path and parquet file name (default: data.parquet)\n",
    "# @param data_parquet_target path and parquet file name\n",
    "# (default: data_sample.parquet)\n",
    "# @param master url of master (default: local mode)\n",
    "# @param sampling_rate fraction of rows to sample (default: 0.1, i.e. 10%)\n",
    "# @param sampling_seed (default: 42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adult-banks",
   "metadata": {
    "papermill": {
     "duration": 0.17677,
     "end_time": "2021-04-06T10:20:49.472681",
     "exception": false,
     "start_time": "2021-04-06T10:20:49.295911",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from pyspark import SparkConf\n",
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession\n",
    "import shutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "robust-tuesday",
   "metadata": {
    "papermill": {
     "duration": 0.018141,
     "end_time": "2021-04-06T10:20:49.498206",
     "exception": false,
     "start_time": "2021-04-06T10:20:49.480065",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Runtime parameters: each one is read from the environment variable of the\n",
    "# same name and falls back to the default given here.\n",
    "data_parquet = os.environ.get('data_parquet', 'data.parquet')\n",
    "data_parquet_target = os.environ.get(\n",
    "    'data_parquet_target', 'data_sample.parquet')\n",
    "master = os.environ.get('master', \"local[*]\")\n",
    "data_dir = os.environ.get('data_dir', '../../data/')\n",
    "sampling_rate = float(os.environ.get('sampling_rate', 0.1))\n",
    "# The seed has its own variable; parsing it from 'sampling_rate' would make\n",
    "# int() fail on fractional values such as '0.1'.\n",
    "sampling_seed = int(os.environ.get('sampling_seed', 42))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "liked-going",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Later cells are skipped when the sampled output already exists.\n",
    "skip = os.path.exists(data_dir + data_parquet_target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "floppy-canadian",
   "metadata": {
    "papermill": {
     "duration": 10.111445,
     "end_time": "2021-04-06T10:20:59.618326",
     "exception": false,
     "start_time": "2021-04-06T10:20:49.506881",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if not skip:\n",
    "    # Get or create a context bound to the configured master, then obtain\n",
    "    # the session from the builder.\n",
    "    spark_conf = SparkConf().setMaster(master)\n",
    "    sc = SparkContext.getOrCreate(spark_conf)\n",
    "    spark = SparkSession.builder.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "occupied-lying",
   "metadata": {
    "papermill": {
     "duration": 6.702477,
     "end_time": "2021-04-06T10:21:06.329699",
     "exception": false,
     "start_time": "2021-04-06T10:20:59.627222",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if not skip:\n",
    "    # Read the input parquet data from the data directory.\n",
    "    source_path = data_dir + data_parquet\n",
    "    df = spark.read.parquet(source_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "domestic-biodiversity",
   "metadata": {
    "papermill": {
     "duration": 0.072413,
     "end_time": "2021-04-06T10:21:06.408685",
     "exception": false,
     "start_time": "2021-04-06T10:21:06.336272",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if not skip:\n",
    "    # Randomly sample rows at the configured rate, using the configured seed.\n",
    "    df = df.sample(fraction=sampling_rate, seed=sampling_seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "incoming-bobby",
   "metadata": {},
   "outputs": [],
   "source": [
    "if not skip:\n",
    "    # Remove any leftover output directory; removal errors are ignored.\n",
    "    target_path = data_dir + data_parquet_target\n",
    "    shutil.rmtree(target_path, ignore_errors=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hairy-zambia",
   "metadata": {
    "papermill": {
     "duration": 0.724797,
     "end_time": "2021-04-06T10:21:07.142732",
     "exception": true,
     "start_time": "2021-04-06T10:21:06.417935",
     "status": "failed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if not skip:\n",
    "    # Write the sampled rows as parquet to the target path.\n",
    "    target_path = data_dir + data_parquet_target\n",
    "    df.write.parquet(target_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 21.56499,
   "end_time": "2021-04-06T10:21:07.466914",
   "environment_variables": {},
   "exception": true,
   "input_path": "/home/jovyan/work/examples/pipelines/pairs/component-library/filter/spark-sample.ipynb",
   "output_path": "/home/jovyan/work/examples/pipelines/pairs/component-library/filter/spark-sample.ipynb",
   "parameters": {},
   "start_time": "2021-04-06T10:20:45.901924",
   "version": "2.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
